If you are new yo R, make sure to read these info boxes.
First load some dependencies and create the output directory:
In R, packages can be loaded using library(). If a package is not installed, you can install it from CRAN using install.packages(). The dplyr package is a commonly used package for data manipulation, readxl is required for reading the Excel file.
list.files("./dataset", full.names = "TRUE")
## [1] "./dataset/metadata.txt" "./dataset/samples.xlsx"
## [3] "./dataset/seqtab.txt" "./dataset/sequences.fasta"
## [5] "./dataset/taxonomy.txt"
In a relative file path, .. indicates the parent directory.
../dataset/seqtab.txt contains the ASV table, so it has one row per ASV, and the number of reads in a sample in different columns.
seqtab <- read.table("./dataset/seqtab.txt", sep = "\t", header = TRUE)
rmarkdown::paged_table(seqtab)
read.table() reads a delimited text file to a data frame. sep = “ means that the file is tab delimited.
../dataset/taxonomy.txt contains a taxon name for each ASV.
taxonomy <- read.table("./dataset/taxonomy.txt", sep = "\t", header = TRUE)
paged_table(taxonomy)
These names originate from the reference database and will have to be matched to WoRMS later.
We also have an Excel file with sample info.
samples <- read_excel("./dataset/samples.xlsx")
samples
## # A tibble: 2 × 11
## name size event_begin area_name area_longitude area_latitude
## <chr> <dbl> <chr> <chr> <dbl> <dbl>
## 1 EE0493 1450 24/04/2023 Ile esprit 46.2 9.43
## 2 EE0495 1500 02/04/2023 Settlement beach 46.2 9.40
## # ℹ 5 more variables: area_uncertainty <dbl>, parent_area_name <chr>,
## # dna <dbl>, depth <dbl>, temperature <dbl>
At this point we could start quality control on the individual tables, but if we first join and map the tables to Darwin Core occurrence terms, the quality control code will be easier to read.
Let’s start with the sample table. This table has sample identifiers, time, coordinates, coordinate uncertainty, locality, and higher geography which can all be mapped to Darwin Core. Keep the dna field for later.
event <- samples %>%
select(
eventID = name,
materialSampleID = name,
eventDate = event_begin,
locality = area_name,
decimalLongitude = area_longitude,
decimalLatitude = area_latitude,
coordinateUncertaintyInMeters = area_uncertainty,
higherGeography = parent_area_name,
minimumDepthInMeters = depth,
maximumDepthInMeters = depth,
sampleSizeValue = size,
dna,
temperature
) %>%
mutate(sampleSizeUnit = "ml")
event
## # A tibble: 2 × 14
## eventID materialSampleID eventDate locality decimalLongitude decimalLatitude
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 EE0493 EE0493 24/04/2023 Ile espr… 46.2 9.43
## 2 EE0495 EE0495 02/04/2023 Settleme… 46.2 9.40
## # ℹ 8 more variables: coordinateUncertaintyInMeters <dbl>,
## # higherGeography <chr>, minimumDepthInMeters <dbl>,
## # maximumDepthInMeters <dbl>, sampleSizeValue <dbl>, dna <dbl>,
## # temperature <dbl>, sampleSizeUnit <chr>
The code above uses several dlyr functions. select() selects and optionally renames a set of columns from the dataframe. mutate() creates a new column. %>% is the pipe operator which is used to string functions together.
Next is the ASV table. This table is in a wide format with ASVs as rows and samples as columns. We will convert this to a long format, with one row per occurrence and the number of sequence reads as organismQuantity. We will use the sample identifier as eventID and the combination of sample identifier and ASV number as the occurrenceID.
To do from a wide to a long table, use the gather() function from the tidyr package. paste0() is used to combine character strings.
#library(tidyr)
occurrence <- seqtab %>%
gather(eventID, organismQuantity, 2:3) %>%
filter(organismQuantity > 0) %>%
mutate(
occurrenceID = paste0(eventID, "_", asv),
organismQuantityType = "sequence reads"
)
paged_table(occurrence)
We can now add the taxonomic names to our occurrence table.
taxonomy <- taxonomy %>%
select(asv, verbatimIdentification = taxonomy)
occurrence <- occurrence %>%
left_join(taxonomy, by = "asv")
paged_table(occurrence)
left_join() joins two dataframes by matching columns. The by argument specifies the columns to match on.
occurrence <- event %>%
left_join(occurrence, by = "eventID")
paged_table(occurrence)
Populate samplingProtocol with a link the the eDNA Expeditions protocol.
occurrence$samplingProtocol <- "https://github.com/BeBOP-OBON/UNESCO_protocol_collection"
Let’s first match the taxa with WoRMS. This can be done using the obistools package. Before matching with WoRMS we will remove underscores from the scientific names.
taxon_names <- stringr::str_replace(occurrence$verbatimIdentification, "_", " ")
Now match the names, this can take a few minutes.
matched <- obistools::match_taxa(taxon_names, ask = FALSE) %>%
select(scientificName, scientificNameID)
## 433 names, 0 without matches, 11 with multiple matches
#The solution can be found in this file:
#matched <- read.table("./solutions/matched_results.txt", sep = "\t", header = T)
paged_table(matched)
occurrence <- bind_cols(occurrence, matched)
paged_table(occurrence)
non_matches <- occurrence %>%
filter(is.na(scientificNameID)) %>%
group_by(verbatimIdentification) %>%
summarize(n = n()) %>%
arrange(desc(n))
write.table(non_matches, file = file.path(output_dir, "nonmatches.txt"), sep = "\t", row.names = FALSE, na = "", quote = FALSE)
paged_table(non_matches)
Normally we have to resolve these names one by one, but for this exercise we will just fix the most common errors. For example, records annotated as eukaryotes can be populated with scientificName Incertae sedis and scientificNameID urn:lsid:marinespecies.org:taxname:12.
occurrence <- occurrence %>%
mutate(
scientificName = case_when(verbatimIdentification %in% c("Eukaryota", "undef_Eukaryota", "") ~ "Incertae sedis", .default = scientificName),
scientificNameID = case_when(verbatimIdentification %in% c("Eukaryota", "undef_Eukaryota", "") ~ "urn:lsid:marinespecies.org:taxname:12", .default = scientificNameID)
)
OBIS will automatically link higher taxonomic levels based on the worms IDS. However, with the following workflow you can add them also.
#library(dplyr)
#library(purrr)
dummy_data <- occurrence %>%
select(scientificName, scientificNameID) %>%
mutate(aphiaid = as.numeric(stringr::str_extract(scientificNameID, "\\d+$")))
taxonomy_worrms <- map(unique(dummy_data$aphiaid[!is.na(dummy_data$aphiaid)]), worrms::wm_record) %>%
bind_rows() %>%
select(AphiaID, kingdom, phylum, class, order, family, genus, scientificname, rank)
dummy_data <- dummy_data %>%
left_join(taxonomy_worrms, by = c("aphiaid" = "AphiaID"))
occurrence <- bind_cols(occurrence, dummy_data %>% select(kingdom, phylum, class, order, family, genus, rank))
paged_table(occurrence)
Now let’s check the coordinates by plotting the distinct coordinate pairs on a map.
#library(leaflet)
stations <- occurrence %>%
distinct(locality, decimalLongitude, decimalLatitude)
stations
## # A tibble: 2 × 3
## locality decimalLongitude decimalLatitude
## <chr> <dbl> <dbl>
## 1 Ile esprit 46.2 9.43
## 2 Settlement beach 46.2 9.40
leaflet() %>%
addTiles() %>%
addMarkers(lng = stations$decimalLongitude, lat = stations$decimalLatitude, popup = stations$locality)
There’s clearly something wrong with the coordinates. Longitude looks fine, let’s try flipping latitude.
occurrence <- occurrence %>%
mutate(decimalLatitude = -decimalLatitude)
stations <- occurrence %>%
distinct(locality, decimalLongitude, decimalLatitude)
stations
## # A tibble: 2 × 3
## locality decimalLongitude decimalLatitude
## <chr> <dbl> <dbl>
## 1 Ile esprit 46.2 -9.43
## 2 Settlement beach 46.2 -9.40
leaflet() %>%
addTiles() %>%
addMarkers(lng = stations$decimalLongitude, lat = stations$decimalLatitude, popup = stations$locality)
Now fix the occurrence table.
Now check the event dates.
obistools::check_eventdate(occurrence)
## # A tibble: 13,375 × 4
## level row field message
## <chr> <int> <chr> <chr>
## 1 error 1 eventDate eventDate 24/04/2023 does not seem to be a valid date
## 2 error 2 eventDate eventDate 24/04/2023 does not seem to be a valid date
## 3 error 3 eventDate eventDate 24/04/2023 does not seem to be a valid date
## 4 error 4 eventDate eventDate 24/04/2023 does not seem to be a valid date
## 5 error 5 eventDate eventDate 24/04/2023 does not seem to be a valid date
## 6 error 6 eventDate eventDate 24/04/2023 does not seem to be a valid date
## 7 error 7 eventDate eventDate 24/04/2023 does not seem to be a valid date
## 8 error 8 eventDate eventDate 24/04/2023 does not seem to be a valid date
## 9 error 9 eventDate eventDate 24/04/2023 does not seem to be a valid date
## 10 error 10 eventDate eventDate 24/04/2023 does not seem to be a valid date
## # ℹ 13,365 more rows
It looks like eventDate is in the wrong format. Use the lubridate package to parse the current date format and change it.
#library(lubridate)
occurrence <- occurrence %>%
mutate(eventDate = format_ISO8601(parse_date_time(eventDate, "%d/%m/%Y"), precision = "ymd", usetz = FALSE))
unique(occurrence$eventDate)
## [1] "2023-04-24" "2023-04-02"
head(occurrence)
## # A tibble: 6 × 29
## eventID materialSampleID eventDate locality decimalLongitude decimalLatitude
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 EE0493 EE0493 2023-04-24 Ile espr… 46.2 -9.43
## 2 EE0493 EE0493 2023-04-24 Ile espr… 46.2 -9.43
## 3 EE0493 EE0493 2023-04-24 Ile espr… 46.2 -9.43
## 4 EE0493 EE0493 2023-04-24 Ile espr… 46.2 -9.43
## 5 EE0493 EE0493 2023-04-24 Ile espr… 46.2 -9.43
## 6 EE0493 EE0493 2023-04-24 Ile espr… 46.2 -9.43
## # ℹ 23 more variables: coordinateUncertaintyInMeters <dbl>,
## # higherGeography <chr>, minimumDepthInMeters <dbl>,
## # maximumDepthInMeters <dbl>, sampleSizeValue <dbl>, dna <dbl>,
## # temperature <dbl>, sampleSizeUnit <chr>, asv <chr>, organismQuantity <int>,
## # occurrenceID <chr>, organismQuantityType <chr>,
## # verbatimIdentification <chr>, samplingProtocol <chr>, scientificName <chr>,
## # scientificNameID <chr>, kingdom <chr>, phylum <chr>, class <chr>, …
Let’s check if any required fields are missing.
obistools::check_fields(occurrence)
## # A tibble: 2 × 4
## level field row message
## <chr> <chr> <lgl> <chr>
## 1 error occurrenceStatus NA Required field occurrenceStatus is missing
## 2 error basisOfRecord NA Required field basisOfRecord is missing
occurrence <- occurrence %>%
mutate(
occurrenceStatus = "present",
basisOfRecord = "MaterialSample"
)
We have several measurements that can be added to the MeasurementOrFact extension: sequence reads, sample volume, and DNA extract concentration.
mof_reads <- occurrence %>%
select(occurrenceID, measurementValue = organismQuantity) %>%
mutate(
measurementType = "sequence reads"
)
mof_samplesize <- occurrence %>%
select(occurrenceID, measurementValue = sampleSizeValue, measurementUnit = sampleSizeUnit) %>%
mutate(
measurementType = "sample size",
measurementTypeID = "http://vocab.nerc.ac.uk/collection/P01/current/VOLWBSMP/",
measurementUnit = "ml",
measurementUnitID = "http://vocab.nerc.ac.uk/collection/P06/current/VVML/"
)
mof_dna <- occurrence %>%
select(occurrenceID, measurementValue = dna) %>%
mutate(
measurementType = "DNA concentration",
measurementTypeID = "http://vocab.nerc.ac.uk/collection/P01/current/A260DNAX/",
measurementUnit = "ng/μl",
measurementUnitID = "http://vocab.nerc.ac.uk/collection/P06/current/UNUL/"
)
mof_temperature <- occurrence %>%
select(occurrenceID, measurementValue = temperature) %>%
mutate(
measurementType = "seawater temperature",
measurementTypeID = "http://vocab.nerc.ac.uk/collection/P01/current/TEMPPR01/",
measurementUnit = "degrees Celsius",
measurementUnitID = "http://vocab.nerc.ac.uk/collection/P06/current/UPAA/"
)
mof <- bind_rows(mof_reads, mof_samplesize, mof_dna)
paged_table(mof)
#library(Biostrings)
fasta_file <- readDNAStringSet("./dataset/sequences.fasta")
fasta <- data.frame(asv = names(fasta_file), DNA_sequence = paste(fasta_file))
paged_table(fasta)
dna <- occurrence %>%
select(occurrenceID, asv, concentration = dna) %>%
left_join(fasta, by = "asv")
paged_table(dna)
We have a file with some sequencing metadata, print the file contents and add the corresponding fields to the DNADerivedData table.
cat(paste0(readLines("./dataset/metadata.txt"), collapse = "\n"))
## eDNA Expeditions sequencing info
##
## target gene: COI
## forward primer: GGWACWGGWTGAACWGTWTAYCCYCC
## reverse primer: TANACYTCNGGRTGNCCRAARAAYCA
## forward primer name: mlCOIintF
## reverse primer name: dgHCO2198
## primer reference: doi:10.1186/1742-9994-10-34
## library layout: paired
## sequencing platform: Illumina NovaSeq6000
dna <- dna %>%
mutate(
concentrationUnit = "ng/μl",
lib_layout = "paired",
target_gene = "COI",
pcr_primers = "FWD:GGWACWGGWTGAACWGTWTAYCCYCC;REV:TANACYTCNGGRTGNCCRAARAAYCA",
seq_meth = "Illumina NovaSeq6000",
ref_db = "https://github.com/iobis/edna-reference-databases",
pcr_primer_forward = "GGWACWGGWTGAACWGTWTAYCCYCC",
pcr_primer_reverse = "TANACYTCNGGRTGNCCRAARAAYCA",
pcr_primer_name_forward = "mlCOIintF",
pcr_primer_name_reverse = "dgHCO2198",
pcr_primer_reference = "doi:10.1186/1742-9994-10-34"
) %>%
select(-asv)
paged_table(dna)
Write text files and compress.
occurrence <- occurrence %>%
select(-asv, -dna, -temperature)
write.table(occurrence, file = file.path(output_dir, "occurrence.txt"), sep = "\t", row.names = FALSE, na = "", quote = FALSE)
write.table(mof, file = file.path(output_dir, "measurementorfact.txt"), sep = "\t", row.names = FALSE, na = "", quote = FALSE)
write.table(dna, file = file.path(output_dir, "dnaderiveddata.txt"), sep = "\t", row.names = FALSE, na = "", quote = FALSE)
#remotes::install_github("pieterprovoost/r-dwca-writer")
#library(dwcawriter)
archive <- list(
eml = '<eml:eml packageId="https://obis.org/dummydataset/v1.0" scope="system" system="http://gbif.org" xml:lang="en" xmlns:dc="http://purl.org/dc/terms/" xmlns:eml="eml://ecoinformatics.org/eml-2.1.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="eml://ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.2/eml.xsd">
<dataset>
<title xml:lang="en">Dummy Dataset</title>
</dataset>
</eml:eml>',
core = list(
name = "occurrence",
type = "https://rs.gbif.org/core/dwc_occurrence_2022-02-02.xml",
index = which(names(occurrence) == "occurrenceID"),
data = occurrence
),
extensions = list(
list(
name = "measurementorfact",
type = "https://rs.gbif.org/extension/obis/extended_measurement_or_fact_2023-08-28.xml",
index = which(names(mof) == "occurrenceID"),
data = mof
),
list(
name = "dnaderiveddata",
type = "https://rs.gbif.org/extension/gbif/1.0/dna_derived_data_2022-02-23.xml",
index = which(names(dna) == "occurrenceID"),
data = dna
)
)
)
write_dwca(archive, file.path(output_dir, "archive.zip"))
## [1] TRUE TRUE TRUE TRUE TRUE